### FIGURE 3 ###
library(pheatmap) # Heatmap
library(dplyr) # Dataframe process  
library(tidyr) 
library(RColorBrewer) # Colors 
library(ggplot2) # Draw plots
library(ggpubr)
library(cowplot) # Combine plots into grid or layout
library(ggplotify) # Transform plots into ggplot object
library(ggrepel)
library(grid) # Grid
library(gridExtra) 
library(lattice) # Draw plots
library(ggtreeExtra)

# PANEL A: distribution of the number of prophages per genome
# Load the prophage matrix (tab-separated, first line is the header)
matrix_all_ph <- read.table(
  file = "matrix_90_ab_1prcst_cl.tsv",
  header = TRUE,
  sep = "\t"
)

# Row-wise sum of columns 2 to 352 gives the total number of prophages
matrix_all_ph_sum <- matrix_all_ph %>%
  mutate(total_phages = rowSums(select(., 2:352)))

# Drop the first data row and keep only the identifier (X) and the total
# NOTE(review): the first row is presumably not a genome - confirm against the input file
matrix_all_ph_sum <- matrix_all_ph_sum[-1, ]
data_a <- select(matrix_all_ph_sum, X, total_phages)

# Draw plot: number of genomes (count) for each number of prophages.
# `adjust` was removed: it is a kernel-density parameter that stat = "count"
# does not use, so it only produced an "unknown parameters" warning.
A <- ggplot(data_a, aes(x = total_phages)) +
  geom_density(stat = "count", color = "#548B54", fill = "#9AFF9A", alpha = 0.7) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(1, 8, by = 1)) +
  xlab("No. of prophages") +
  ylab("No. of genomes ") +
  theme(axis.text = element_text(size = 13))

# PANEL B: UpSet plot of defense-system combinations
library(ComplexUpset)
library(tidyverse)

# Input tables: binary presence matrix (first column as row names) and the
# two-column (strain, defense system type) list
bin_matrix <- read.table(
  file = "./ab_defense-finder/binary_matrix_wophages.tsv",
  sep = "\t", header = TRUE, row.names = 1
)
data_noph <- read.table(
  file = "./ab_defense-finder/defense_finder_systems_wophages_nored100.tsv",
  sep = "\t", col.names = c("strains", "types")
)

# Give selected defense system columns their display names (by column position)
colnames(bin_matrix)[32:38] <- c(
  "PD-Lambda-1", "PD-Lambda-2", "PD-Lambda-5",
  "PD-T4-5", "PD-T4-7", "PD-T7-2", "PD-T7-5"
)
colnames(bin_matrix)[42] <- "R-M"
colnames(bin_matrix)[53] <- "Ssp"

# Number of occurrences of every defense system type
counts_st_noph <- tally(group_by(data_noph, types))
# Types seen fewer than 17 times (per the original note: less than 10% of the genomes without phages)
drop_type <- counts_st_noph$types[counts_st_noph$n < 17]

# Sets shown in the plot (per the original note: types present in more than 10% of the genomes without phages)
types_def <- c(
  "Septu", "Gabija", "CBASS", "Cas", "RosmerTA", "R-M",
  "PD-Lambda-2", "PD-T7-5", "Gao_Qat", "Ssp", "Other"
)

# Fill colour of the bars and dots of each set, in query order
ds_fill <- c(
  "Ssp" = "#FB8072",
  "Gao_Qat" = "#FDB462",
  "PD-T7-5" = "#4DAF4A",
  "PD-Lambda-2" = "#7fc97f",
  "R-M" = "#BC808D",
  "RosmerTA" = "#E78AC3",
  "Cas" = "#BEBADA",
  "CBASS" = "#8DD3C7",
  "Gabija" = "#FFFFB3",
  "Septu" = "#80B1D3",
  "Other" = "gray"
)
ds_queries <- lapply(names(ds_fill), function(ds_name) {
  upset_query(set = ds_name, fill = ds_fill[[ds_name]])
})

# Draw Upset plot
B <- upset(
  bin_matrix, types_def,
  name = "Defense systems (DSs)",
  n_intersections = 30,
  sort_intersections_by = "cardinality",
  sort_sets = "descending",
  set_sizes = upset_set_size() + ylab("No. of genomes with the indicated DS"),
  base_annotations = list(
    "Intersection size" = intersection_size() +
      ylab("No. of genomes with \n    the indicated combs. of DSs")
  ),
  # Shape and size of the intersection dots
  matrix = intersection_matrix(
    geom = geom_point(size = 4, shape = "circle filled")
  ),
  # Colour of the row background
  stripes = "white",
  queries = ds_queries
)


# PANEL C: isolation source per MLST
# Input tables: MLST assignment (no header), isolation metadata and subregions
mlst_file <- read.table(
  file = "mlst_ab_freq_wored100.tsv",
  header = FALSE, sep = "\t", col.names = c("ID", "mlst")
)
metadata_file <- read.table(
  file = "metadata_ab_is.tsv",
  header = TRUE, sep = "\t"
)
subregion_file <- read.table(
  file = "metadata_ab_subregions.tsv",
  header = TRUE, sep = "\t", quote = ","
)

# Join MLST and metadata on the shared ID column
mlst_md <- merge(mlst_file, metadata_file, by = "ID")

# Count genomes per (MLST, isolation type) and express each count as a
# percentage; after tally() the table is still grouped by mlst, so sum(n)
# is taken per MLST
count <- mlst_md %>%
  group_by(mlst, Isolation.type) %>%
  tally() %>%
  mutate(per = round(n / sum(n) * 100, 2))

# Relabel empty cells and the "environmental/other" category
count <- replace(count, count == "", "other")
count <- replace(count, count == "environmental/other", "environment")

#count$Subregion[count$Subregion == ""] <- NA

# Draw plot: stacked bars with the genome count printed inside each segment
C <- ggplot(
  count,
  aes(
    x = factor(mlst, levels = c("ST2", "ST79", "ST1", "ST3", "ST499", "ST10", "ST78", "ST25")),
    y = per,
    fill = Isolation.type
  )
) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = n), size = 4, position = position_stack(vjust = .5)) +
  theme_minimal() +
  labs(x = "", y = "Percent of genomes", fill = "Isolation source") +
  scale_fill_manual(
    values = c("#EEA2AD", "darkseagreen2", "#7EC0EE"),
    na.value = "gray90"
  ) +
  ylim(0, 100.1) +
  theme(
    axis.text.x = element_text(size = 13),
    axis.text.y = element_text(size = 11)
  )




# PANEL D: phylogeny annotated with groups, subregions, defense systems and prophages
library(ggtree)
library(ggnewscale)
library(treeio)
library(ape)
library(ggtreeExtra)

# Loading data
# Prophage matrices; the first column is used as row names
Pmatrix <- read.csv(
  "matrix_90_ab_cl_freqmlst8_wored100.tsv",
  header = TRUE, sep = "\t", row.names = 1
)
Pmatrix_all <- read.table(
  "matrix_90_ab_1prcst_cl.tsv",
  sep = "\t", header = TRUE, row.names = 1
)
Pmatrix_all <- Pmatrix_all[-1, ] # Drop the first data row
tree <- read.tree("phylogeny_R/iqtree_Sept2024_def.treefile")
tree <- root(tree, outgroup = "ab04946", resolve.root = TRUE) # Root of the tree
ds <- read.table(
  "phylogeny_R/defsys_presaus_ann.tsv",
  sep = "\t", stringsAsFactors = TRUE, header = TRUE, row.names = 1
)
# Rename columns
colnames(ds)[1:4] <- c("R-M_Type_I", "R-M_Type_II", "R-M_Type_III", "R-M_Type_IV")
colnames(ds)[11:13] <- c("PD-T4-5", "PD-T7-5", "PD-Lambda-2")
####
groups <- read.table(
  "phylogeny_R/groups.tsv",
  sep = "\t", stringsAsFactors = TRUE, header = TRUE, row.names = 1
)
mlst <- read.table("mlst_ab_freq_wored100.tsv", sep = "\t", stringsAsFactors = FALSE)

# Define the main MLST groups in the tree:
# one vector of V1 entries per distinct value of V2
st123 <- split(mlst$V1, mlst$V2)
otus <- groupOTU(tree, st123)

# Draw phylogeny, branches coloured by MLST group
phylo_tmp <- ggtree(otus, aes(color = group), layout = "rectangular", branch.length = "bootstrap", size = 0.8) +
  geom_treescale(x = 0, y = 6100, offset = 10) +
  vexpand(0.15, direction = 1) +
  scale_color_manual(
    values = c("black", "#E41A1C", "#377EB8", "goldenrod1", "#4DAF4A", "#984EA3", "#FF7F00", "#A65628", "#F781BF"),
    breaks = c(0, "ST1", "ST2", "ST3", "ST10", "ST25", "ST78", "ST79", "ST499"),
    labels = c("other", "ST1", "ST2", "ST3", "ST10", "ST25", "ST78", "ST79", "ST499"),
    guide = "none"
  ) +
  labs(color = "MLST") +
  guides(color = guide_legend(ncol = 2)) +
  theme(legend.position = "none")

# Nodes separating the main clades
nodes_mlst <- c(8103, 8105, 8110, 8111, 8277, 8567, 8577, 8580, 10520, 9098, 9932, 9929, 9930, 9928, 9099, 9100, 9101, 9119)

# Point out the good bootstrap values of only the principal nodes.
# The label column is character, so it is converted to numeric before the
# comparison: a text comparison against 80 would rank "100" below "80".
# Labels that are not plain numbers become NA and are not highlighted.
# NOTE(review): assumes node labels hold a single support value - confirm for this tree file
clade_nodes <- phylo_tmp$data[phylo_tmp$data$node %in% nodes_mlst, ]
clade_support <- suppressWarnings(as.numeric(clade_nodes$label))
well_supported <- clade_nodes[!is.na(clade_support) & clade_support >= 80, ]
phylo_tmp <- phylo_tmp +
  geom_point2(aes(x = x, y = y), data = well_supported, color = "#66CD00", size = 3)

# Attach the groups table as a narrow heatmap column next to the tree,
# then label the two groups with vertical text
groups_heatmap <- gheatmap(
  phylo_tmp, groups,
  width = 0.1,
  offset = -0.01,
  color = NA,
  colnames_position = "top",
  colnames_angle = 70,
  colnames_offset_y = 40,
  font.size = 5,
  hjust = 0
)
phylo0 <- groups_heatmap +
  scale_fill_manual(name = "Groups", values = c("#EE8262", "#BFEFFF"), guide = "none") +
  theme(text = element_text(size = 20)) +
  annotate(geom = "text", x = 0.195, y = 1400, label = "Group1", color = "black", angle = 90) +
  annotate(geom = "text", x = 0.195, y = 4400, label = "Group2", color = "black", angle = 90)

# GEOGRAPHICAL INFORMATION

geo <- read.table("metadata_ab_subregions.tsv", sep = "\t", quote = ",", header = TRUE)

# Index by genome ID and keep only the subregion column
rownames(geo) <- geo$ID
geo123 <- geo %>% select(Subregion)

# The former `metadata$condition <- factor(...)` statement was removed:
# no `metadata` object is created anywhere in this script, so the line
# stopped execution with "object 'metadata' not found".

# Fix the display order of the subregions; values not listed here become NA
# NOTE(review): "Nothern Europe" is kept exactly as written - confirm it matches the spelling used in the input file
geo123$Subregion <- factor(
  geo123$Subregion,
  levels = c(
    "Nothern Europe", "Western Europe", "Southern Europe", "Eastern Europe",
    "Southern Asia", "South-eastern Asia", "Western Asia", "Eastern Asia",
    "Northern Africa", "Sub-Saharan Africa",
    "Northern America", "Latin America and the Caribbean",
    "Australia and New Zealand"
  )
)

# Regions in the order defined above, plus "Unknown"
# NOTE(review): "Unknown" is not one of the factor levels, so any such value
# is already NA and is drawn with na.value rather than with its own colour - confirm intent
regions <- c(levels(geo123$Subregion), "Unknown")

# One colour per region, matched by name
country_colors <- setNames(
  c(
    "#104E8B", "steelblue2", "#98F5FF", "aquamarine3",
    "firebrick1", "#FF6A6A", "#FF3E96", "#FFA07A",
    "darkgoldenrod1", "#CDAD00",
    "#68228B", "#AB82FF",
    "#66CD00",
    "gray90"
  ),
  regions
)

# New fill scale so the subregion colours do not clash with the previous heatmap
phylo00 <- phylo0 + new_scale_fill()
phylo01 <- gheatmap(
  phylo00, geo123,
  colnames_position = "top", colnames_angle = 70,
  colnames_offset_y = 450, colnames_offset_x = 0.01,
  offset = 0.01, font.size = 4, width = 0.12, color = NA
) +
  scale_fill_manual(values = country_colors, na.value = "white", guide = "none") +
  labs(fill = "Subregions")

# Defense systems heatmap
# Palette: 14 colours interpolated from the 12-colour Brewer "Set3" set
getPalette <- colorRampPalette(brewer.pal(12, "Set3"))
mis_colores <- getPalette(14)

# Start a fresh fill scale, then append the defense systems columns
phylo1 <- gheatmap(
  phylo01 + new_scale_fill(), ds,
  width = 1.6,
  offset = 0.040,
  color = NA,
  colnames_position = "top",
  colnames_angle = 70,
  colnames_offset_y = 40,
  font.size = 4,
  hjust = 0
) +
  scale_fill_manual(
    name = "Defense systems",
    values = mis_colores,
    na.value = "white",
    na.translate = FALSE
  ) +
  guides(position = "none") +
  theme(text = element_text(size = 18))

# Prophage presence/absence heatmap
# Column order of the prophages (according to their frequency in MLST groups)
col_vector <- c(
  "DgiS1", "P1003", "P1017", "P1055", "P1231", "P1233", "P1311", "P1023",
  "P1165", "P1068", "P1184", "P1074", "P1372", "P1331", "P1076", "P1012",
  "P1179", "P1151", "P1054", "P1251", "P1371", "P1041", "P1050", "P1011",
  "P1002", "P1009", "P1240", "P1471", "P1075", "P1105", "P1339", "P18",
  "P2140", "P1035", "P1161", "P1255", "P1491", "P1661", "P1140", "P1167",
  "P1059", "P1115", "P1114", "P1031", "P1310", "P1183", "P1107"
)
sorted_idx <- match(col_vector, colnames(Pmatrix))
Pmatrix_sorted <- Pmatrix[, sorted_idx]
# Logical matrix: TRUE wherever the value is non-zero
Pmatrix_sorted <- Pmatrix_sorted != 0

# Fresh fill scale, then append the prophage columns
phylo2 <- phylo1 + new_scale_fill()
phylo3 <- gheatmap(
  phylo2, Pmatrix_sorted,
  width = 3.4,
  offset = 0.37,
  color = NA,
  colnames = TRUE,
  colnames_position = "top",
  colnames_angle = 90,
  colnames_offset_y = 40,
  font.size = 4,
  hjust = 0
) +
  scale_fill_manual(
    values = c("white", "#804AD5"),
    labels = c("Absence", "Presence"),
    na.value = "white",
    na.translate = FALSE,
    guide = "none"
  ) +
  labs(fill = "Prophages") +
  annotate(geom = "text", x = 0.9, y = 8550, label = "Prophages presence/absence")

# Add count barplots of total prophages
nphages <- rowSums(Pmatrix_all, na.rm = FALSE, dims = 1) # Sum all prophages by row
nphages <- data.frame(nphages)
nphages$Strain <- rownames(nphages)
nphages$col_flag <- nphages$nphages > 0 # Fill absence and presence with different colours

# Negative value to point out the absence of prophages.
# rowSums() already returns numbers, so the comparison is done numerically
# (previously against the string "0", followed by a redundant as.numeric()).
nphages$nphages <- ifelse(nphages$nphages == 0, -1, nphages$nphages)

# Draw the final phylogeny: bar of prophage counts beside every tip
phylo4 <- phylo3 + new_scale_fill()
fig <- phylo4 +
  geom_fruit(
    data = nphages,
    geom = geom_col,
    pwidth = 0.5,
    mapping = aes(y = Strain, x = nphages, fill = col_flag),
    offset = 5.55,
    axis.params = list(
      axis       = "x",
      text.size  = 5,
      hjust      = 0.5,
      vjust      = -53.7, # Change
      title      = "No. prophages",
      title.size = 5
    )
  ) +
  # Fill bars: FALSE (no prophages) and TRUE (at least one) are labelled "0" and ">0"
  scale_fill_manual(
    name = "No. prophages",
    values = c(`TRUE` = "palegreen2", `FALSE` = "#CD2626"),
    na.value = "white",
    labels = c("0", ">0")
  ) +
  annotate("rect", xmin = 1.207, xmax = 1.227, ymin = 7900, ymax = 8200, fill = "white") # White square to hide -2 label


#phylo4 + geom_zoom_clade(node = 10794, xexpand = 6.5) ## Zoom in a clade (by node number)
#subtree <- viewClade(phylo4, node=10794)
#print(subtree)
# PANEL E: number of prophages per MLST
# Loading data (the second column of the file provides the row names)
matrix_ph_mlst <- read.table(
  file = "matrix_90_ab_ml_1prcst_nored100_cl_mlst.tsv",
  header = TRUE, sep = "\t", row.names = 2
)
matrix_ph_mlst$X <- NULL

# Calculate total number of prophages (row-wise sum of columns 1 to 351)
matrix_ph_mlst <- matrix_ph_mlst %>%
  mutate(total_phages = rowSums(select(., 1:351)))
matrix_ph_mlst <- matrix_ph_mlst[-1, ] # Drop the first data row

# Filter
data_d <- select(matrix_ph_mlst, mlst, total_phages)
# Genomes without an MLST are labelled "Other". Assigning into the column
# vector (instead of into a row-subset of the data frame) also works when no
# row matches or when mlst contains NA, both of which used to raise an error.
data_d$mlst[data_d$mlst %in% ""] <- "Other"
st_counts <- data_d %>%
  group_by(mlst) %>%
  tally()

# Statistics
# Normality test for every MLST; the result is relabelled so the printed
# output states which MLST it belongs to
for (st in c("ST2", "ST79", "ST1", "ST3", "ST499", "ST10", "ST78", "ST25", "Other")) {
  st_normality <- shapiro.test(data_d$total_phages[data_d$mlst %in% st])
  st_normality$data.name <- paste0("total_phages (", st, ")")
  print(st_normality)
}

# Difference in prophage numbers across MLSTs
kwtest <- kruskal.test(total_phages ~ mlst, data_d)
print(kwtest$p.value)

# Draw plot: violin + boxplot per MLST with the group size printed on top
E <- ggplot(
  data_d,
  aes(
    x = factor(mlst, levels = c("ST2", "ST79", "ST1", "ST3", "ST499", "ST10", "ST78", "ST25", "Other")),
    y = total_phages,
    fill = mlst
  )
) +
  geom_violin(show.legend = FALSE, adjust = 1.5) +
  geom_boxplot(width = 0.12, colour = "black", outliers = FALSE, show.legend = FALSE) +
  theme_minimal() +
  labs(x = "", y = "No. of prophages", fill = "MLSTs") +
  scale_y_continuous(limits = c(0, 9), breaks = seq(0, 9, by = 2)) +
  theme(
    axis.text = element_text(size = 14),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 12)
  ) +
  scale_fill_manual(values = c("#808080", "#fb7676", "#85d385", "#7d92d0", "#af7bd6", "goldenrod1", "#facbd2", "#fdb56e", "#c29474")) +
  # One vjust per row of st_counts; the fourth label gets a different offset
  geom_text(
    data = st_counts,
    aes(x = mlst, y = Inf, label = paste0("n = ", n)),
    vjust = c(rep(5, 3), 2.5, rep(5, 5)), hjust = 0.7, size = 2.5
  )

# PANEL F: prophage counts and relative frequencies per MLST
# Input tables: absolute counts and relative frequencies (wide format)
mlst_ph <- read.table(file = "mlst_phages_counts.tsv", header = TRUE, sep = "\t")
mlst_ph_freq <- read.table(file = "mlst_phages_freq.tsv", header = TRUE, sep = "\t")

# Order of the prophages on the x-axis (according to the phylogeny)
col_vector <- c(
  "DgiS1", "P1003", "P1017", "P1055", "P1231", "P1233", "P1311", "P1023",
  "P1165", "P1068", "P1184", "P1074", "P1372", "P1331", "P1076", "P1012",
  "P1179", "P1151", "P1054", "P1251", "P1371", "P1041", "P1050", "P1011",
  "P1002", "P1009", "P1240", "P1471", "P1075", "P1105", "P1339", "P18",
  "P2140", "P1035", "P1161", "P1255", "P1491", "P1661", "P1140", "P1167",
  "P1059", "P1115", "P1114", "P1031", "P1310", "P1183", "P1107"
)

# Wide -> long: every column except X becomes a (Phage, value) pair
phage_to_long <- function(wide, value_name) {
  pivot_longer(wide, cols = -X, names_to = "Phage", values_to = value_name)
}
counts_long <- phage_to_long(mlst_ph, "Abs")
freq_long <- phage_to_long(mlst_ph_freq, "Rel")

# Combine counts and frequencies on the X and Phage columns
merge_data <- merge(counts_long, freq_long, by = c("X", "Phage"))

# Keep only the rows where both the count and the frequency are non-zero
filtered_data <- filter(merge_data, Abs != 0, Rel != 0)

# Bubble plot: size = absolute count, colour = relative frequency
F <- ggplot(
  data = filtered_data,
  aes(
    x = factor(Phage, levels = col_vector),
    y = factor(X, levels = c("ST25", "ST78", "ST10", "ST499", "ST3", "ST1", "ST79", "ST2")),
    color = Rel
  )
) +
  geom_point(aes(size = Abs)) +
  theme_minimal() +
  scale_color_gradient2(
    low = "#FFE1FF", mid = "#912CEE", high = "#551A8B",
    midpoint = 60, limits = c(0, 100),
    breaks = c(0, 25, 50, 75, 100), labels = c(0, 25, 50, 75, 100)
  ) +
  scale_size_continuous(
    range = c(3, 14),
    breaks = c(0, 10, 100, 500, 1000, 3000),
    name = "No. of prophages"
  ) +
  labs(color = "Relative freq.") +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, size = 13),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_text(size = 13)
  ) +
  scale_x_discrete(expand = c(0.05, 0))


# PANEL G: number of prophages in Group 1 vs Group 2
# Identifier lists of the two groups, one entry per line
group1 <- readLines("./phylogeny_R/ggtree_g1.ab")
group2 <- readLines("./phylogeny_R/ggtree_g2.ab")

# Rows of the Panel A table belonging to each group
in_group1 <- data_a$X %in% group1
in_group2 <- data_a$X %in% group2
data_g1 <- data_a[in_group1, ]
data_g2 <- data_a[in_group2, ]

# Tag each subset with its group name and combine them into one table
data_g1$Unnamed..0 <- "Group 1"
data_g2$Unnamed..0 <- "Group 2"
data_g <- merge(data_g1, data_g2, all = TRUE)
g_count <- tally(group_by(data_g, Unnamed..0))

# Statistics: normality per group, then comparison between the groups
print(shapiro.test(data_g1$total_phages))
print(shapiro.test(data_g2$total_phages))

wt <- wilcox.test(data_g1$total_phages, data_g2$total_phages)
print(wt$p.value)

# Draw plot: violin + boxplot per group; the dashed line marks the mean of
# total_phages over the whole Panel A table
overall_mean <- mean(data_a$total_phages)
G <- ggplot(data_g, aes(x = Unnamed..0, y = total_phages, fill = Unnamed..0)) +
  geom_violin(show.legend = FALSE, adjust = 1.25, width = 0.5) +
  geom_boxplot(width = 0.12, colour = "black", outliers = FALSE, show.legend = FALSE) +
  xlab("") +
  ylab("No. of prophages") +
  scale_fill_manual(values = c("#EE8262", "#BFEFFF")) +
  geom_hline(yintercept = overall_mean, linetype = "dashed", color = "black", linewidth = 1) +
  scale_y_continuous(limits = c(0, 9), breaks = seq(0, 9, by = 2)) +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 13),
    axis.text = element_text(size = 14)
  ) +
  geom_text(
    data = g_count,
    aes(x = Unnamed..0, y = Inf, label = paste0("n = ", n)),
    vjust = 5, hjust = -0.5, size = 4
  )



# Assemble the panels into the final figure and write it to fig3.pdf
library(cowplot)
empty_plot <- ggplot() + theme_minimal()

# Top row: panels A-C; bottom row: panels E-G
ABC <- plot_grid(
  A, B, C,
  labels = c("A", "B", "C"),
  ncol = 3,
  rel_widths = c(3, 6, 5)
)
EF <- plot_grid(
  E, F, G,
  labels = c("E", "F", "G"),
  ncol = 3,
  rel_widths = c(5, 9, 4)
)

# Stack the rows, with the phylogeny (panel D) in the middle
def <- plot_grid(
  ABC, fig, EF,
  labels = c("", "D", ""),
  ncol = 1,
  rel_heights = c(6, 10, 5.5)
)

pdf(file = "fig3.pdf", width = 21, height = 18.5, paper = "special")
print(def)
dev.off()